% SCAN benchmark paper (Lake and Baroni), cited via its arXiv eprint.
% NOTE(review): confirm primaryClass against the arXiv listing for 1711.00350.
@misc{lake2018scan,
    lp_title      = {SCAN dataset (compositional generalization)},
    author        = {Lake, Brenden M. and Baroni, Marco},
    title         = {Generalization without Systematicity: {On} the Compositional Skills of Sequence-to-Sequence Recurrent Networks},
    year          = {2018},
    eprint        = {1711.00350},
    archivePrefix = {arXiv},
    primaryClass  = {cs.AI},
    doi           = {10.48550/arXiv.1711.00350}
}

% GSM8K paper; authors are written in "Last, First" form so name parsing is unambiguous.
@misc{cobbe2021training,
    lp_title      = {GSM8K},
    author        = {Cobbe, Karl and Kosaraju, Vineet and Bavarian, Mohammad and Chen, Mark and Jun, Heewoo and Kaiser, Lukasz and Plappert, Matthias and Tworek, Jerry and Hilton, Jacob and Nakano, Reiichiro and Hesse, Christopher and Schulman, John},
    title         = {Training Verifiers to Solve Math Word Problems},
    year          = {2021},
    eprint        = {2110.14168},
    archivePrefix = {arXiv},
    primaryClass  = {cs.LG},
}

% HotpotQA dataset paper. The dataset name is brace-protected in the title so
% that sentence-casing bibliography styles do not lowercase it to "Hotpotqa".
@misc{yang2018hotpotqa,
    lp_title={hotpotQA},
    title={{HotpotQA}: A Dataset for Diverse, Explainable Multi-hop Question Answering},
    author={Yang, Zhilin and Qi, Peng and Zhang, Saizheng and Bengio, Yoshua and Cohen, William W. and Salakhutdinov, Ruslan and Manning, Christopher D.},
    year={2018},
    eprint={1809.09600},
    archivePrefix={arXiv},
    primaryClass={cs.CL}
}

% MultiArith source paper (EMNLP 2015, ACL Anthology record D15-1202).
@inproceedings{roy-roth-2015-solving,
    lp_title  = {multiarith},
    author    = {Roy, Subhro and Roth, Dan},
    title     = {Solving General Arithmetic Word Problems},
    booktitle = {Proceedings of the 2015 Conference on Empirical Methods in Natural Language Processing},
    year      = {2015},
    month     = sep,
    address   = {Lisbon, Portugal},
    publisher = {Association for Computational Linguistics},
    pages     = {1743--1752},
    doi       = {10.18653/v1/D15-1202},
    url       = {https://aclanthology.org/D15-1202},
}

% FEVER dataset paper. The acronym and the capitalised words it is built from
% are brace-protected so sentence-casing styles keep the intended capitals.
@misc{thorne2018fever,
    lp_title={fever dataset},
    title={{FEVER}: a large-scale dataset for {Fact} {Extraction} and {VERification}},
    author={Thorne, James and Vlachos, Andreas and Christodoulopoulos, Christos and Mittal, Arpit},
    year={2018},
    eprint={1803.05355},
    archivePrefix={arXiv},
    primaryClass={cs.CL}
}

% BBQ bias benchmark paper. The acronym is brace-protected in the title so
% sentence-casing styles do not reduce it to "Bbq".
@misc{parrish2021bbq,
    lp_title={bbq},
    title={{BBQ}: A Hand-Built Bias Benchmark for Question Answering},
    author={Parrish, Alicia and Chen, Angelica and Nangia, Nikita and Padmakumar, Vishakh and Phang, Jason and Thompson, Jana and Htut, Phu Mon and Bowman, Samuel R.},
    year={2021},
    eprint={2110.08193},
    archivePrefix={arXiv},
    primaryClass={cs.CL}
}